/*******************************************************************************

Ryan Hill: ryan.hill@kellogg.northwestern.edu
Carolyn Stein: carolyn_stein@berkeley.edu
Last modified: December 2024

Inputs: 	pdb_ClusterEntity.csv (downloaded 5/22/2018)
			uniprot_pubmed.dta 
			clean_summary.dta
			clean_citation.dta		

Outputs:	clean_entity.dta

*******************************************************************************/

clear all

* Load the raw PDB entity/cluster table, reduce it to one row per
* structure-entity pair, and hold it in a tempfile for the merges below
	import delimited using "${data_raw}PDB/pdb_ClusterEntity.csv", clear

	* Switch the all-lowercase imported names to camelCase
	rename (structureid chainid entityid) (structureId chainId entityId)
	rename (clusternumber100 clusternumber95 clusternumber90)				///
		(clusterNum100 clusterNum95 clusterNum90)
	rename (uniprotacc uniprotrecommendedname uniprotalternativenames)		///
		(uniprotAcc uniprotRecName uniprotAltNames)
	rename (genename authorassignedentityname taxonomyid)					///
		(geneName authorAssignedName taxonomyId)

	sort structureId entityId chainId, stable

	* Chain rows carry nothing beyond the entity-level information, so drop
	* the chain identifier and then remove rows that are exact copies
	drop chainId
	duplicates drop

	* Halt if any structure-entity pair still appears more than once
	tempvar nCopies
	duplicates tag structureId entityId, gen(`nCopies')
	assert `nCopies' == 0
	drop `nCopies'

	tempfile entities
	save `entities'

* Build the long UniProt-publication table used as the using side of joinby
	use "${data_raw}UniProt/uniprot_pubmed.dta", clear

	drop pubdate 		// mostly missing
	format journal %30s

	order uniprotId jj pmid pubyr issn journal
	sort uniprotId pubyr jj

	* Number of rows (UniProt links) attached to each PubMed paper; left
	* missing for rows whose PubMed ID is itself missing
	bysort pmid: gen numUniprotLinksPmid = _N if pmid != .

	* Prefix the publication fields with up_ so they stay distinguishable
	* from the PDB-side variables after the join
	rename (pmid pubyr issn journal jj)										///
		(up_pmid up_pubyr up_issn up_journal up_jj)

	tempfile uniprot_long
	save `uniprot_long'


* Merge together summary data, citation data, and entity data
	use "${data_clean}clean_summary.dta", clear
	merge 1:1 structureId using "${data_clean}clean_citation.dta", nogen
	merge 1:m structureId using `entities', nogen

	keep structureId entityId depositionDate pubmedId uniprotAcc

* Deal with the small number of entities with multiple UniProt IDs
	* uniprotAcc holds "#"-separated accessions. split creates one variable
	* per position (uniprotAcc1, uniprotAcc2, ...), however many there are.
	split uniprotAcc, p(#)
	drop uniprotAcc

	* Go to one row per structure-entity-accession. Empty slots beyond the
	* first are padding and are dropped; the first slot is always kept so an
	* entity with no UniProt ID at all still has one row. Keying the drop on
	* the position index avoids assuming a fixed number of split variables.
	reshape long uniprotAcc, i(structureId entityId) j(jj)
	drop if uniprotAcc == "" & jj > 1
	rename jj multi_uniprot   // collapse these together later

	rename uniprotAcc uniprotId
	
* Join the PDB structures and entities with the uniprot publication data
	* unm(master) keeps structure-entity rows with no UniProt publications
	joinby uniprotId using `uniprot_long', unm(master)

	* Flag the UniProt paper that is the paper directly linking to the PDB.
	* Require a non-missing PubMed ID: in Stata . == . is true, so two
	* missing IDs would otherwise be flagged as a match.
	gen up_pdb_flag = pubmedId == up_pmid & !missing(up_pmid)

	gen depositionYear = year(depositionDate)
	sort depositionYear structureId entityId up_pubyr

	* Years from publication to deposit (>= 0 means published no later than
	* the deposit year); missing if either year is missing
	gen up_pdb_year = depositionYear - up_pubyr

	* Mark papers that came prior to PDB (or same year).
	* Stata treats missing as larger than any number, so an unguarded
	* "up_pdb_year >= 0" would count papers with an unknown publication or
	* deposition year as prior papers; exclude them explicitly.
	gen before_pdb = up_pdb_year >= 0 & !missing(up_pdb_year)
	replace before_pdb = 0 if up_pmid == .
	* The structure's own paper is not a prior paper
	replace before_pdb = 0 if up_pdb_flag

	* Count up papers prior to PDB deposit at the structure-entity level
	collapse (sum) before_pdb (first) depositionDate pubmedId, 				///
		by(structureId entityId)

	tempfile before_pdb
	save `before_pdb'

	* Reload the entity table; clear guards against unsaved data in memory
	use `entities', clear

	* Attach the per-entity count of prior papers
	merge 1:1 structureId entityId using `before_pdb'

* Count number of entities
	* Rows per structure after the merge
	* NOTE(review): rows present only in `before_pdb' (if any) are counted
	* too -- confirm that is intended
	bysort structureId: gen numEntities = _N

* Save
	keep structureId entityId geneName authorAssignedName taxonomy 			///
		taxonomyId before_pdb  clusterNum* numEntities
	sort structureId entityId

	* Output name follows the file header (Outputs: clean_entity.dta)
	save "${data_clean}clean_entity.dta", replace



